import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# load data and get some insight
# NOTE(review): read_csv defaults to a ',' separator; the UCI bank-full.csv
# distribution is ';'-separated -- confirm the file on disk uses commas.
pdata = pd.read_csv("bank-full.csv")
pdata.head()
# Cardinality of each column, to distinguish categorical from continuous features.
unique_values = pdata.nunique()
print('Count unique values in each column')
print(unique_values)
# Summary statistics of the numeric columns (transposed for readability).
pdata.describe().transpose()
Looking at the data, we can see that we have outliers in balance, pdays and duration.
# check for null values: count missing entries per column
pdata.isnull().sum()
Based on the summary above, we do not see any bad or missing data.
# Rows x columns of the raw data.
pdata.shape
# Class balance of the label column.
pdata['Target'].value_counts()
# Distribution of days-since-last-contact.
pdata['pdays'].value_counts()
We notice that pdays is highly imbalanced; if we train without fixing this, the model would be biased.
# Distribution of the number of previous contacts per customer.
pdata['previous'].value_counts()
# Histograms of every numeric column, for a first look at skew and outliers.
pdata.hist(bins=20, figsize=(16,12))
plt.show()
def remove_outlier(df_in, col_name):
    """Return a copy of *df_in* with Tukey-fence outliers on *col_name* removed.

    Parameters
    ----------
    df_in : pd.DataFrame
        Input frame; not modified.
    col_name : str
        Numeric column to filter on.

    Returns
    -------
    pd.DataFrame
        Rows whose value lies strictly inside (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
        Rows exactly on a fence are dropped too (strict inequalities).
    """
    # NOTE: indentation restored -- the notebook export had flattened the
    # body to column 0, which is not valid Python.
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3 - q1
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
    return df_out
# Clean duration outliers first ...
pdata_clean = remove_outlier(pdata,'duration')
pdata_clean.head()
# ... then drop balance outliers from the already-filtered frame.
pdata_clean = remove_outlier(pdata_clean, 'balance')
pdata_clean.head()
# I noticed some outliers in 'previous' with value > 50, so filter those out.
pdata_clean = pdata_clean[pdata_clean['previous'] < 50]
pdata_clean.info()
def countplot(label, using_hue=False):
    """Draw a countplot of column *label* from the module-level pdata_clean
    frame, annotating each bar and the y axis with percentage shares.

    Parameters
    ----------
    label : str
        Column of pdata_clean to count.
    using_hue : bool
        When True, split each category's bars by the 'Target' column.
    """
    # NOTE: indentation restored -- the notebook export had flattened the
    # body to column 0, which is not valid Python.
    plt.figure(figsize=(15,5))
    total = len(pdata_clean[label])
    if using_hue:
        ax = sns.countplot(x=label, data=pdata_clean, hue='Target')
    else:
        ax = sns.countplot(x=label, data=pdata_clean)
    # Label each bar with its share of the WHOLE frame (not of its hue group).
    for p in ax.patches:
        ax.annotate('{:.1f}%'.format(100 * p.get_height()/total), (p.get_x() + 0.1, p.get_height() + 5))
    # Re-scale the y axis ticks to percentages of the total row count.
    ax.yaxis.set_ticks(np.linspace(0, total, 11))
    ax.set_yticklabels(map('{:.1f}%'.format, 100 * ax.yaxis.get_majorticklocs()/total))
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
    plt.show()
# Class balance of the label after cleaning.
countplot('Target')
# Count by marital
countplot("marital")
# Include Target so we can see how marital status might impact Target.
countplot("marital", True)
# Count by job
countplot("job")
# Include Target so we can see how job might impact Target.
countplot('job', True)
# Count by education
countplot("education")
# Include Target so we can see how education might impact Target.
countplot("education", True)
# Count by credit-default flag
countplot('default')
# Include Target so we can see how default might impact Target.
countplot('default', True)
From the above chart we can tell that default (whether the customer has credit in default or not) plays an important role in whether customers subscribe to a term deposit.
# Count by housing-loan flag
countplot('housing')
# Include Target so we can see how housing might impact Target.
countplot('housing', True)
# Count by personal-loan flag, then split by Target.
countplot('loan')
countplot('loan', True)
From the above we can see that loan is an important feature for telling whether customers subscribe to a term deposit or not.
# Count by contact channel, then split by Target.
countplot('contact')
countplot('contact', True)
From the above chart we can tell that contacting customers by cellular plays an important role in whether customers subscribe to a term deposit or not.
# Count by previous-campaign outcome, then split by Target.
countplot('poutcome')
countplot('poutcome', True)
# Histograms of the cleaned numeric columns, for comparison with the raw data.
pdata_clean.hist(bins=20, figsize=(16,12))
plt.show()
# Pairwise scatter of numeric features, colored by the label.
sns.pairplot(pdata_clean, hue="Target", palette="husl")
# Age distribution per class.
sns.boxplot(x='Target', y='age', data=pdata_clean)
From the above chart, we can see that most customers, subscribers and non-subscribers alike, are between 30 and 50 years old and the two groups overlap heavily, so age is not a good feature to include in our training model.
# Per-class distributions of call duration, previous contacts and pdays.
sns.boxplot(x='Target', y='duration', data=pdata_clean)
sns.boxplot(x='Target', y='previous', data=pdata_clean)
sns.boxplot(x='Target', y='pdays', data=pdata_clean)
plt.figure(figsize=(10,8))
# sns.distplot is deprecated (and removed in seaborn 0.14); histplot with
# kde=True and stat="density" reproduces its histogram + density overlay.
sns.histplot(pdata_clean["duration"], kde=True, stat="density")
plt.show()
# Balance by job, split by Target (on the raw, uncleaned frame).
fig = plt.figure(figsize=(20,10))
fig1 = sns.boxplot(x="job", y="balance", hue="Target", data=pdata)
# Rotate the tick labels the plot already has instead of overwriting them
# with unique() output, whose ordering need not match the plotted categories.
fig1.set_xticklabels(fig1.get_xticklabels(), rotation=45, rotation_mode="anchor")
# Balance by education, split by Target.
fig = plt.figure(figsize=(20,10))
fig2 = sns.boxplot(x="education", y="balance", hue="Target", data=pdata)
# BUG FIX: labels were previously taken from the 'job' column for this
# 'education' plot, mislabeling every tick.
fig2.set_xticklabels(fig2.get_xticklabels(), rotation=45, rotation_mode="anchor")
# Age by job, split by Target.
fig = plt.figure(figsize=(20,10))
fig3 = sns.boxplot(x="job", y="age", hue="Target", data=pdata)
fig3.set_xticklabels(fig3.get_xticklabels(), rotation=45, rotation_mode="anchor")
I notice that the retired and student groups fall outside the normal age range.
# Balance by marital status, split by Target.
fig = plt.figure(figsize=(20,10))
fig4 = sns.boxplot(x="marital", y="balance", hue="Target", data=pdata)
# Rotate the tick labels the plot already has; overwriting them with
# unique() output risks mislabeling, since unique() order is arbitrary.
fig4.set_xticklabels(fig4.get_xticklabels(), rotation=45, rotation_mode="anchor")
# Correlation matrix over the numeric columns only. numeric_only=True is
# required: pandas >= 2.0 raises on frames with non-numeric columns
# (earlier versions silently dropped them with a warning).
corr_matrix = pdata_clean.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True)
plt.show()
Looking at the correlation matrix for the numerical features, we can see that pdays and previous have a high correlation compared with the rest; we can try dropping them and see whether the prediction score improves.
# Map 'no'/'yes' answers to 0/1 across the whole frame.
# NOTE(review): this replace is frame-wide -- confirm 'yes'/'no' only occur
# in the binary columns. pdata_clean is also a filtered slice, so an
# inplace replace may trigger a SettingWithCopyWarning; consider .copy().
pdata_clean.replace({'no': 0, 'yes': 1 }, inplace=True)
pdata_clean.head()
pdata_clean.info()
# One-hot encode the remaining categorical columns, dropping the first
# level of each to avoid the dummy-variable trap.
pdata_model = pd.get_dummies(pdata_clean, drop_first=True)
pdata_model.head()
# Feature matrix and label vector for modeling.
X = pdata_model.drop('Target', axis=1)
Y = pdata_model['Target'].astype('category')
# Split data to train, test with ratio 70/30
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# random_state pins the split so results are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train.head()
# fit logistic regression model
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
# Inspect the learned coefficients (one column per feature) plus intercept.
coef_df = pd.DataFrame(model.coef_)
coef_df['intercept'] = model.intercept_
print(coef_df)
# Mean accuracy on the held-out 30%.
model_score = model.score(X_test, Y_test)
print(model_score)
from sklearn import tree
# Unconstrained gini tree; random_state pins tie-breaking for reproducibility.
dTree = tree.DecisionTreeClassifier(criterion = 'gini', random_state=1)
dTree.fit(X_train, Y_train)
# Train vs test accuracy -- a large gap here indicates overfitting.
print(dTree.score(X_train, Y_train))
print(dTree.score(X_test, Y_test))
# Test accuracy for LogReg
y_pred_df_logReg = pd.DataFrame()
y_pred_df_logReg['actual'] = Y_test
# Probability of the positive class (column 1 of predict_proba).
y_pred_df_logReg["predicted_prob"] = model.predict_proba(X_test)[:,1]
# Classify with a 0.6 threshold instead of the default 0.5.
y_pred_df_logReg['predicted'] = y_pred_df_logReg.predicted_prob.map( lambda x: 1 if x > 0.6 else 0)
# Converted values which are above 0.6 as '1' and rest as '0'
y_pred_df_logReg[0:5]
# Evaluating model performance by using confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
def draw_cm( actual, predicted ):
    """Plot a confusion-matrix heatmap with the positive class ('Yes'/1) first.

    Parameters
    ----------
    actual, predicted : array-like of 0/1 labels.
    """
    # NOTE: body indentation restored -- the notebook export had flattened it.
    # labels must be passed by keyword: modern sklearn made the extra
    # positional arguments of confusion_matrix keyword-only.
    cm = confusion_matrix( actual, predicted, labels=[1,0] )
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels = ["Yes", "No"] , yticklabels = ["Yes", "No"] )
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
draw_cm(y_pred_df_logReg.actual, y_pred_df_logReg.predicted)
# Test accuracy for DecisionTree
y_pred_df_dTree = pd.DataFrame()
y_pred_df_dTree['actual'] = Y_test
# Probability of the positive class (column 1 of predict_proba).
y_pred_df_dTree["predicted_prob"] = dTree.predict_proba(X_test)[:,1]
# Same 0.6 threshold as the logistic-regression evaluation, for comparability.
y_pred_df_dTree['predicted'] = y_pred_df_dTree.predicted_prob.map( lambda x: 1 if x > 0.6 else 0)
# Converted values which are above 0.6 as '1' and rest as '0'
y_pred_df_dTree[0:5]
# Evaluating model performance by using confusion matrix
# NOTE(review): this re-imports and redefines draw_cm identically to the
# earlier cell -- a duplicated notebook cell; consider reusing the first.
from sklearn.metrics import confusion_matrix, accuracy_score
def draw_cm( actual, predicted ):
    """Plot a confusion-matrix heatmap with the positive class ('Yes'/1) first.

    Parameters
    ----------
    actual, predicted : array-like of 0/1 labels.
    """
    # NOTE: body indentation restored -- the notebook export had flattened it.
    # labels must be passed by keyword: modern sklearn made the extra
    # positional arguments of confusion_matrix keyword-only.
    cm = confusion_matrix( actual, predicted, labels=[1,0] )
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels = ["Yes", "No"] , yticklabels = ["Yes", "No"] )
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
draw_cm(y_pred_df_dTree.actual, y_pred_df_dTree.predicted)
# RandomForest
from sklearn.ensemble import RandomForestClassifier
# Shallow, entropy-split forest; equal class weights and a minimum leaf
# size of 5 to limit overfitting.
rfcl = RandomForestClassifier(criterion = 'entropy', class_weight={0:.5,1:.5}, max_depth = 5, min_samples_leaf=5)
rfcl = rfcl.fit(X_train, Y_train)
test_pred = rfcl.predict(X_test)
rfcl.score(X_test , Y_test)
# Keep the score and a hyper-parameter description for the summary table.
rf_score = rfcl.score(X_test , Y_test)
rf_metric = "criterion = 'entropy', class_weight={0:.5,1:.5}, max_depth = 5, min_samples_leaf=5"
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 20 weak learners.
abcl = AdaBoostClassifier(n_estimators=20)
abcl = abcl.fit(X_train, Y_train)
# BUG FIX: was abcl.predict(X_train); test_pred should hold held-out
# predictions, consistent with the RandomForest and Bagging cells.
test_pred = abcl.predict(X_test)
abcl.score(X_test , Y_test)
# Keep the score and a hyper-parameter description for the summary table.
abcl_score = abcl.score(X_test , Y_test)
abcl_metric = "n_estimators=20"
from sklearn.ensemble import BaggingClassifier
# 20 bootstrap bags, each trained on 70% of the rows.
bgcl = BaggingClassifier(n_estimators=20, max_samples= .7, bootstrap=True)
bgcl = bgcl.fit(X_train, Y_train)
test_pred = bgcl.predict(X_test)
bgcl.score(X_test , Y_test)
# Keep the score and a hyper-parameter description for the summary table.
bgcl_score = bgcl.score(X_test , Y_test)
bgcl_metric = "n_estimators=20, max_samples= .7, bootstrap=True"
# Collect each ensemble's test score and hyper-parameters side by side.
data = {'Name': ['RandomForest', 'AdaBoost', 'Bagging'],
'Score': [rf_score, abcl_score, bgcl_score],
'Metric': [rf_metric,abcl_metric,bgcl_metric]}
df_summary = pd.DataFrame(data)
df_summary.head()
It looks like Bagging is the winner, with the best score compared with the other two.